The data contains features extracted from the silhouettes of vehicles viewed from different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
%matplotlib inline
from sklearn import metrics
from scipy.stats import zscore
from sklearn import model_selection
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix
# Load the vehicle silhouette dataset from disk
dataFilePath = './vehicle.csv'
data_df = pd.read_csv(dataFilePath)
data_df.head()
# Report how many observations and attributes were loaded
rows, cols = data_df.shape
print(f"There are {rows} observations / rows and {cols} attributes / columns")
# Column dtypes and non-null counts for each attribute
data_df.info()
data_df.columns
# Is there any null value anywhere in the frame?
data_df.isnull().values.any()
# Per-column count of missing entries
data_df.apply(lambda col : sum(col.isnull()))
# Rows that contain at least one missing value, restricted to the affected columns
missing_values_cols = data_df.columns[data_df.isnull().any()]
rows_with_na = data_df[data_df.isnull().any(axis=1)][missing_values_cols]
rows_with_na.head()
missingRows, missingCols = rows_with_na.shape
print(f"There are {missingRows} rows with {missingCols} missing columns")
# Eye ball the summary statistics
data_df.describe().transpose()
# Make a copy of the dataframe before replacing missing values with the median,
# in case the original dataframe needs to be revisited later.
data_df_Original = data_df.copy()
# Impute missing values column-wise with each column's median.
# BUG FIX: numeric_only=True keeps the string 'class' column out of the median
# computation -- pandas >= 2.0 raises a TypeError on it, while older versions
# silently skipped it, so the result is unchanged on old pandas.
data_df.fillna(data_df.median(numeric_only=True),axis=0,inplace=True)
data_df.info()
# Eye ball the data after imputation
data_df.describe().transpose()
# Distribution of every attribute on a 7x3 subplot grid:
# histograms (with KDE and rug) for continuous columns,
# count bars for categorical columns.
plt.figure(figsize=(30, 20))
totalrows, totalcols = data_df.shape
categorical_vars = data_df.describe(include=["object"]).columns
continuous_vars = data_df.describe(exclude=['object']).columns
for pos, feature in enumerate(data_df.columns, start=1):
    plt.subplot(7, 3, pos)
    if feature in continuous_vars:
        sns.distplot(data_df[feature], kde=True, rug=True)
    if feature in categorical_vars:
        sns.countplot(data_df[feature], palette='Blues')
# Histograms of all numeric attributes
numerical_cols = list(data_df.select_dtypes(exclude=['object']))
_= data_df.hist(column=numerical_cols, figsize = (16,16))
# Skewness per numeric column.
# BUG FIX: corrected the "Skweness" typo in the printed message, and added
# numeric_only=True so skew() ignores the string 'class' column (pandas >= 2.0
# raises on it; older versions skipped it silently, so output is unchanged).
print(f"Skewness is: \n{data_df.skew(numeric_only=True)}")
# Box plots of the same columns to eyeball outliers
numerical_cols = list(data_df.select_dtypes(exclude=['object']))
_= data_df.boxplot(column=numerical_cols, figsize = (20,20))
# One box plot per continuous attribute on a 10x2 grid
plt.figure(figsize= (20,40))
continuous_vars = data_df.describe(exclude=['object']).columns
for pos, feature in enumerate(continuous_vars, start=1):
    plt.subplot(10, 2, pos)
    sns.boxplot(data_df[feature])
# function to determine outliers of a given series and clamp them in place
def determineAndReplaceOutliers(aSeries, columnName, df=None, whisker=1.5):
    """Detect IQR outliers in *aSeries* and clamp them in *df* (Tukey's fences).

    Values below ``q1 - whisker*iqr`` are replaced with that lower threshold,
    and values above ``q3 + whisker*iqr`` with the upper threshold.

    Parameters
    ----------
    aSeries : pd.Series
        Column values to scan for outliers.
    columnName : str
        Name of the column in *df* whose outlier rows are overwritten.
    df : pd.DataFrame, optional
        DataFrame modified in place. Defaults to the module-level ``data_df``,
        preserving the original notebook behaviour.
    whisker : float, optional
        IQR multiplier for the fences; 1.5 is the conventional value.

    Returns
    -------
    tuple
        ``(threshold_low, outliers_low, threshold_high, outliers_high)``
    """
    if df is None:
        df = data_df  # backward-compatible fallback to the notebook's global frame
    q1 = aSeries.quantile(0.25)
    q3 = aSeries.quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    threshold_low = q1 - whisker * iqr
    threshold_high = q3 + whisker * iqr
    outliers_low = aSeries[(aSeries < threshold_low)]
    outliers_high = aSeries[(aSeries > threshold_high)]
    print(f'threshold low: {threshold_low} \nlow outliers count =', outliers_low.count())
    df.loc[outliers_low.index, columnName] = threshold_low
    print(f'replaced {outliers_low.count()} low outliers with (q1-1.5*iqr) = {threshold_low}')
    print(f'threshold high: {threshold_high} \nhigh outliers count = {outliers_high.count()}')
    df.loc[outliers_high.index, columnName] = threshold_high
    # BUG FIX: the original message said "(q1+1.5*iqr)"; the high fence is q3-based.
    print(f'replaced {outliers_high.count()} high outliers with (q3+1.5*iqr) = {threshold_high}')
    return threshold_low, outliers_low, threshold_high, outliers_high
# Clamp IQR outliers in the columns whose box plots showed heavy tails.
# NOTE(review): the original cells called the function TWICE in a row for
# 'max.length_aspect_ratio' (a copy-paste slip at the 3rd/4th call); a second
# pass re-derives the fences from already-clamped data, so the duplicate is
# dropped here and the repetitive cells are collapsed into one loop.
outlier_columns = [
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration.1',
    'skewness_about',
]
for columnName in outlier_columns:
    threshold_low, outliers_low, threshold_high, outliers_high = determineAndReplaceOutliers(data_df[columnName], columnName)
# Re-draw the box plots after outlier clamping to confirm the tails are gone
numeric_columns = list(data_df.select_dtypes(exclude=['object']))
_ = data_df.boxplot(column=numeric_columns, figsize=(20, 20))
# Eye ball the summary statistics again
data_df.describe().transpose()
# Per-class density of every feature: car (red), van (blue), bus (green),
# one subplot per feature on a 10x2 grid, then a pairwise scatter matrix.
plt.figure(figsize=(20, 40))
feature_cols = data_df.drop('class', axis=1).columns
class_colors = (('car', 'r'), ('van', 'b'), ('bus', 'g'))
for pos, feature in enumerate(feature_cols, start=1):
    plt.subplot(10, 2, pos)
    for cls, color in class_colors:
        sns.distplot(data_df[data_df['class'] == cls][feature], color=color)
sns.pairplot(data_df, hue='class', diag_kind='kde')
plt.show()
# Pairwise Pearson correlations of the numeric attributes.
# BUG FIX: numeric_only=True excludes the string 'class' column -- pandas >= 2.0
# raises a TypeError on it, older versions dropped it silently, so the numeric
# result is identical either way.
data_df.corr(numeric_only=True)
plt.figure(figsize=(10,10))
sns.heatmap(data_df.corr(numeric_only=True),
            annot=True,
            linewidths=.5,
            center=1,
            cbar=False,
            cmap="YlGnBu")
plt.show()
# Class balance check
sns.countplot(data_df['class'])
plt.show()
data_df['class'].value_counts()
from sklearn.preprocessing import LabelEncoder
# Encode the target labels ('bus'/'car'/'van') as integers
labelencoder = LabelEncoder()
data_df['class'] = labelencoder.fit_transform(data_df['class'])
data_df['class'].value_counts()
# independent variables
X = data_df.drop('class',axis=1)
# the dependent variable
y = data_df['class']
print("shape of independent variable: ",X.shape)
print("shape of dependent variable: ",y.shape)
# scale independent attributes to zero mean / unit variance
from scipy.stats import zscore
X = X.apply(zscore)
# BUG FIX: the original had `X.head` (a bound-method reference that only
# displays the method repr); the call was clearly intended.
X.head()
# Covariance matrix of the standardized features (features as columns)
covMatrix = np.cov(X,rowvar=False)
print(covMatrix)
from sklearn.decomposition import PCA
# Fit PCA keeping all 18 components to inspect the full variance spectrum
pca = PCA(n_components=18)
pca.fit(X)
# The eigen values of the covariance matrix
print(pca.explained_variance_)
# The eigen vectors (principal directions)
print(pca.components_)
# Fraction of the total variance carried by each component
print(pca.explained_variance_ratio_)
component_ids = list(range(1, 19))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance explained -- used to choose how many components to keep
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Sum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Re-fit keeping only the first 8 components
pca8 = PCA(n_components=8)
pca8.fit(X)
print(pca8.components_)
print(pca8.explained_variance_ratio_)
Xpca8 = pca8.transform(X)
Xpca8
# pair plot of the 8 PCA components instead of all 18 original features
sns.pairplot(pd.DataFrame(Xpca8))
Let's construct two SVM models: the first with all 18 independent variables, and the second with only the 8 new variables constructed using PCA.
# 70/30 train-test split on all 18 original (scaled) dimensions
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train.shape,X_test.shape, y_train.shape, y_test.shape
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix
# Baseline RBF SVM on the raw features
svm = model = SVC(gamma='auto')
model.fit(X_train, y_train)           # fit on the training data
pred = model.predict(X_test)          # predictions on the held-out set
score_train = round(model.score(X_train, y_train), 3)  # train accuracy
score = round(model.score(X_test, y_test), 3)          # test accuracy
print(score_train)
print(score)
print("Confusion matrix with raw data(18 dimension)\n",confusion_matrix(y_test,pred))
from sklearn.model_selection import GridSearchCV
from sklearn import svm
#making the instance
model=svm.SVC()
# Hyper-parameter grid to search over
params = {'C': [0.01, 0.05, 0.5, 1],
          # 'gamma':[0.01, 0.02 , 0.03 , 0.04, 0.05],
          'kernel': ['linear','rbf']}
# 10-fold grid search over the parameter grid (all CPU cores)
gs = GridSearchCV(model, param_grid=params, n_jobs=-1,cv=10)
gs.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:\n",gs.best_params_)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# NOTE(review): removed the unused `seed = 7` and `model = gs` assignments --
# KFold without shuffle ignores any seed, and `model` was never read again.
num_folds = 10
kfold = KFold(n_splits=num_folds)
# Cross-validated accuracy of the tuned estimator on the training split
results = cross_val_score(gs,X_train,y_train, cv=kfold)
print(results)
cross_val_score_raw = "for raw data (18 dimension) cross_val_score Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0)
print (cross_val_score_raw)
prediction=gs.predict(X_test)
print("Accuracy:",metrics.accuracy_score(prediction,y_test))
accuracy_score_raw = metrics.accuracy_score(prediction,y_test)
#evaluation (confusion matrix)
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test,prediction))
# Repeat the experiment on the 8 PCA components:
# 70/30 train-test split, then the same baseline RBF SVM.
from sklearn.model_selection import train_test_split
X_train_pca8, X_test_pca8, y_train_pca8, y_test_pca8 = train_test_split(
    Xpca8, y, train_size=0.7, test_size=0.3, random_state=100)
X_train_pca8.shape,X_test_pca8.shape, y_train_pca8.shape, y_test_pca8.shape
svm = model = SVC(gamma='auto')
model.fit(X_train_pca8, y_train_pca8)       # fit on the PCA training data
pred_pca8 = model.predict(X_test_pca8)      # predictions on the held-out set
score_train_pca8 = round(model.score(X_train_pca8, y_train_pca8), 3)  # train accuracy
score_pca8 = round(model.score(X_test_pca8, y_test_pca8), 3)          # test accuracy
print(score_train_pca8)
print(score_pca8)
print("Confusion matrix with PCA data (8 dimension)\n",confusion_matrix(y_test_pca8,pred_pca8))
from sklearn.model_selection import GridSearchCV
from sklearn import svm
#making the instance
model=svm.SVC()
# Same hyper-parameter grid as the raw-feature search, for a fair comparison
params = {'C': [0.01, 0.05, 0.5, 1],
          # 'gamma':[0.01, 0.02 , 0.03 , 0.04, 0.05],
          'kernel': ['linear','rbf']}
# 10-fold grid search on the PCA-reduced training data
gs = GridSearchCV(model, param_grid=params, n_jobs=-1,cv=10)
gs.fit(X_train_pca8, y_train_pca8)
#The best hyper parameters set
print("Best Hyper Parameters:\n",gs.best_params_)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# NOTE(review): removed the unused `seed = 7` and `model = gs` assignments --
# KFold without shuffle ignores any seed, and `model` was never read again.
num_folds = 10
kfold = KFold(n_splits=num_folds)
# Cross-validated accuracy of the tuned estimator on the PCA training split
results = cross_val_score(gs,X_train_pca8,y_train_pca8, cv=kfold)
print(results)
cross_val_score_pca8 = "for pca (8 dimension) cross_val_score Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0)
print(cross_val_score_pca8)
prediction_pca8=gs.predict(X_test_pca8)
print("Accuracy:",metrics.accuracy_score(prediction_pca8,y_test_pca8))
accuracy_score_pca8 = metrics.accuracy_score(prediction_pca8,y_test_pca8)
#evaluation (confusion matrix)
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test_pca8,prediction_pca8))
# Final comparison: held-out accuracy and 10-fold CV accuracy for the
# 18-feature raw model vs the 8-component PCA model.
print(f'for raw data (18 dimension) accuracy_score: {accuracy_score_raw}\n{cross_val_score_raw}')
print(f'for pca (8 dimension) accuracy_score: {accuracy_score_pca8}\n{cross_val_score_pca8}')